library(ggplot2)
library(MASS) # for the example dataset
library(plyr) # for recoding data
library(ROCR) # for plotting roc
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(e1071) # for NB and SVM
library(rpart) # for decision tree
library(tree)
library(ada) # for adaboost
library(class)#for KNN
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
library(forecast)
library(tseries)
library(leaflet)
Reading booking information.
# Booking data: store id, booking date-time, visit date-time
a_booked_time <- read.csv("air_reserve.csv", header = TRUE, sep = ",")
# summary(a_booked_time)
cal <- read.csv("date_info.csv", header = TRUE, sep = ",")
head(cal)
## calendar_date day_of_week holiday_flg
## 1 2016-01-01 Friday 1
## 2 2016-01-02 Saturday 1
## 3 2016-01-03 Sunday 1
## 4 2016-01-04 Monday 0
## 5 2016-01-05 Tuesday 0
## 6 2016-01-06 Wednesday 0
Changing all date_time information to date only, as we do not have enough information to do hourly prediction
# Strip the time component: keep only the YYYY-MM-DD prefix of each stamp,
# since we lack the data for hourly prediction.
visitDateTime <- as.factor(a_booked_time$visit_datetime)
visitDate <- as.Date(substr(visitDateTime, 1, 10))
reserveDateTime <- as.factor(a_booked_time$reserve_datetime)
reserveDate <- as.Date(substr(reserveDateTime, 1, 10))
a_booked <- a_booked_time
a_booked$visit_datetime <- visitDate
a_booked$reserve_datetime <- reserveDate
head(a_booked)
## air_store_id visit_datetime reserve_datetime reserve_visitors
## 1 air_877f79706adbfb06 2016-01-01 2016-01-01 1
## 2 air_db4b38ebe7a7ceff 2016-01-01 2016-01-01 3
## 3 air_db4b38ebe7a7ceff 2016-01-01 2016-01-01 6
## 4 air_877f79706adbfb06 2016-01-01 2016-01-01 2
## 5 air_db80363d35f10926 2016-01-01 2016-01-01 5
## 6 air_db80363d35f10926 2016-01-02 2016-01-01 2
importing store location information
# Store metadata: genre, area name, latitude/longitude
a_store_addr <- read.csv("air_store_info.csv", header = TRUE, sep = ",")
head(a_store_addr)
## air_store_id air_genre_name air_area_name
## 1 air_0f0cdeee6c9bf3d7 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 2 air_7cc17a324ae5c7dc Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 3 air_fee8dcf4d619598e Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 4 air_a17f0778617c76e2 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 5 air_83db5aff8f50478e Italian/French Tōkyō-to Minato-ku Shibakōen
## 6 air_99c3eae84130c1cb Italian/French Tōkyō-to Minato-ku Shibakōen
## latitude longitude
## 1 34.69512 135.1979
## 2 34.69512 135.1979
## 3 34.69512 135.1979
## 4 34.69512 135.1979
## 5 35.65807 139.7516
## 6 35.65807 139.7516
# 829 stores x 5 columns (per the output below)
dim(a_store_addr)
## [1] 829 5
importing store visits per day information
# Daily visitor counts per store
a_visited <- read.csv("air_visit_data.csv", header = TRUE, sep = ",")
# head(a_visited)
dim(a_visited)
## [1] 252108 3
head(a_visited)
## air_store_id visit_date visitors
## 1 air_ba937bf13d40fb24 2016-01-13 25
## 2 air_ba937bf13d40fb24 2016-01-14 32
## 3 air_ba937bf13d40fb24 2016-01-15 29
## 4 air_ba937bf13d40fb24 2016-01-16 22
## 5 air_ba937bf13d40fb24 2016-01-18 6
## 6 air_ba937bf13d40fb24 2016-01-19 9
Plotting the number of visitors for all the restaurants together throughout the time period.
# Attach calendar info (day of week, holiday flag) to each visit record.
a_visited_cal <- merge(a_visited, cal, by.x = "visit_date", by.y = "calendar_date")
# NOTE(review): removed two dead lines that collapsed visit_date into a single
# comma-separated string via toString(); the result was never used anywhere.
a_visited$visit_date <- as.Date(a_visited$visit_date, format = "%Y-%m-%d")
plot(a_visited$visit_date)
ggplot(a_visited, aes(x = visit_date, y = visitors)) + geom_point()
Analysing cuisine information.
# plot(a_store_addr, x = air_area_name)
# Store counts per cuisine genre, faceted; x-axis labels hidden for legibility.
ggplot(a_store_addr, aes(x = air_genre_name)) +
  geom_bar() +
  facet_wrap(~air_genre_name, nrow = 2) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
summary(a_store_addr$air_genre_name)
## Asian Bar/Cocktail
## 2 79
## Cafe/Sweets Creative cuisine
## 181 13
## Dining bar International cuisine
## 108 2
## Italian/French Izakaya
## 102 197
## Japanese food Karaoke/Party
## 63 2
## Okonomiyaki/Monja/Teppanyaki Other
## 14 27
## Western food Yakiniku/Korean food
## 16 23
# Dimensions and a peek at the store table
dim(a_store_addr)
## [1] 829 5
head(a_store_addr)
## air_store_id air_genre_name air_area_name
## 1 air_0f0cdeee6c9bf3d7 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 2 air_7cc17a324ae5c7dc Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 3 air_fee8dcf4d619598e Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 4 air_a17f0778617c76e2 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 5 air_83db5aff8f50478e Italian/French Tōkyō-to Minato-ku Shibakōen
## 6 air_99c3eae84130c1cb Italian/French Tōkyō-to Minato-ku Shibakōen
## latitude longitude
## 1 34.69512 135.1979
## 2 34.69512 135.1979
## 3 34.69512 135.1979
## 4 34.69512 135.1979
## 5 35.65807 139.7516
## 6 35.65807 139.7516
# Collapse the three rarest genres (2 stores each, per the earlier summary)
# into the existing "Other" level, then re-factor to drop now-empty levels.
# (Removed an unused `xy = c(levels(...))` assignment that was never read,
# and the redundant which() around the logical index.)
rare_genres <- c("Asian", "Karaoke/Party", "International cuisine")
a_store_addr$air_genre_name[a_store_addr$air_genre_name %in% rare_genres] <- "Other"
a_store_addr$air_genre_name <- factor(a_store_addr$air_genre_name)
summary(a_store_addr$air_genre_name)
## Bar/Cocktail Cafe/Sweets
## 79 181
## Creative cuisine Dining bar
## 13 108
## Italian/French Izakaya
## 102 197
## Japanese food Okonomiyaki/Monja/Teppanyaki
## 63 14
## Other Western food
## 33 16
## Yakiniku/Korean food
## 23
# Re-plot the genre distribution after the recode
ggplot(a_store_addr, aes(x = air_genre_name)) +
  geom_bar() +
  facet_wrap(~air_genre_name, nrow = 2) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
Analysing the number of visitors vs. day of the week, and the number of visitors vs. whether the day is a holiday.
# Visitor-count density per weekday, truncated at 150 visitors
ggplot(a_visited_cal, aes(x = visitors)) +
  geom_density(aes(group = day_of_week, fill = day_of_week), alpha = .5) +
  xlim(0, 150)
## Warning: Removed 76 rows containing non-finite values (stat_density).
# One-way ANOVA: does the mean visitor count differ by weekday?
day.comp <- aov(visitors ~ day_of_week, data = a_visited_cal)
summary(day.comp)
## Df Sum Sq Mean Sq F value Pr(>F)
## day_of_week 6 2670533 445089 1647 <2e-16 ***
## Residuals 252101 68120431 270
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Holm-adjusted pairwise t-tests between weekdays
pairwise.t.test(a_visited_cal$visitors, a_visited_cal$day_of_week)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: a_visited_cal$visitors and a_visited_cal$day_of_week
##
## Friday Monday Saturday Sunday Thursday Tuesday
## Monday < 2e-16 - - - - -
## Saturday < 2e-16 < 2e-16 - - - -
## Sunday 5e-10 < 2e-16 < 2e-16 - - -
## Thursday < 2e-16 < 2e-16 < 2e-16 < 2e-16 - -
## Tuesday < 2e-16 0.00018 < 2e-16 < 2e-16 < 2e-16 -
## Wednesday < 2e-16 < 2e-16 < 2e-16 < 2e-16 0.01055 < 2e-16
##
## P value adjustment method: holm
# Load every per-station weather CSV from ./weather/ and build one wide data
# frame per measure: first column is the calendar date from the first file,
# then one column per station file (column named after the file).
temp <- list.files(path = "./weather/", pattern = "*.csv")
temp2 <- paste("./weather/", temp, sep = '')
n <- length(temp)  # was dim(as.data.frame(temp))[1] — same value, simpler
w.data <- read.csv(temp2[1], header = TRUE, sep = ",")
data.prac <- data.frame(w.data$calendar_date)    # precipitation
data.mint <- data.frame(w.data$calendar_date)    # low temperature
data.maxt <- data.frame(w.data$calendar_date)    # high temperature
data.avgt <- data.frame(w.data$calendar_date)    # average temperature
data.hrsun <- data.frame(w.data$calendar_date)   # hours of sunlight
data.windsp <- data.frame(w.data$calendar_date)  # average wind speed
for (i in seq_len(n)) {
  w.data <- read.csv(temp2[i], header = TRUE, sep = ",")
  data.prac <- cbind(data.prac, w.data$precipitation)
  data.mint <- cbind(data.mint, w.data$low_temperature)
  data.maxt <- cbind(data.maxt, w.data$high_temperature)
  # BUG FIX: data.avgt, data.hrsun and data.windsp were cbind-ed onto
  # data.prac, so those three frames never accumulated their own columns.
  data.avgt <- cbind(data.avgt, w.data$avg_temperature)
  data.hrsun <- cbind(data.hrsun, w.data$hours_sunlight)
  data.windsp <- cbind(data.windsp, w.data$avg_wind_speed)
  # BUG FIX: the old name-matching compared against misspelled names
  # ("temprature") and indexed the wrong frame, so most columns were never
  # renamed. Renaming the just-added (last) column is unambiguous.
  names(data.prac)[ncol(data.prac)] <- temp[i]
  names(data.mint)[ncol(data.mint)] <- temp[i]
  names(data.maxt)[ncol(data.maxt)] <- temp[i]
  names(data.avgt)[ncol(data.avgt)] <- temp[i]
  names(data.hrsun)[ncol(data.hrsun)] <- temp[i]
  names(data.windsp)[ncol(data.windsp)] <- temp[i]
}
# Sanity-check the merged visits + calendar table
head(a_visited_cal)
## visit_date air_store_id visitors day_of_week holiday_flg
## 1 2016-01-01 air_c31472d14e29cee8 3 Friday 1
## 2 2016-01-01 air_05c325d315cc17f5 29 Friday 1
## 3 2016-01-01 air_81c5dff692063446 7 Friday 1
## 4 2016-01-01 air_a083834e7ffe187e 27 Friday 1
## 5 2016-01-01 air_36bcf77d3382d36e 34 Friday 1
## 6 2016-01-01 air_09a845d5b5944b01 56 Friday 1
# Visitor-count density split by the holiday flag
ggplot(a_visited_cal, aes(x = visitors)) +
  geom_density(aes(group = holiday_flg, fill = holiday_flg), alpha = .5) +
  xlim(0, 150)
## Warning: Removed 76 rows containing non-finite values (stat_density).
# One-way ANOVA: holiday vs non-holiday visitor counts
day.comp <- aov(visitors ~ holiday_flg, data = a_visited_cal)
summary(day.comp)
## Df Sum Sq Mean Sq F value Pr(>F)
## holiday_flg 1 100261 100261 357.6 <2e-16 ***
## Residuals 252106 70690702 280
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(a_visited_cal$visitors, a_visited_cal$holiday_flg)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: a_visited_cal$visitors and a_visited_cal$holiday_flg
##
## 0
## 1 <2e-16
##
## P value adjustment method: holm
It seems holiday information is also significantly useful, per the t-test.
Mapping of the restaurants. This visual would make it easier to see the areas where the most of the restaurants are located. As can be seen from the map, there are top three areas: Tokyo, Kyoto and Osaka
# Dark-tile map of store locations (orange circles; popup shows the area name)
map_air <- leaflet(a_store_addr) %>%
  addTiles('http://{s}.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png')
map_air %>%
  addCircles(~longitude, ~latitude, popup = a_store_addr$air_area_name,
             weight = 3, radius = 40,
             color = "#ffa500", stroke = TRUE, fillOpacity = 0.8)
# Empirical CDF of visitor counts with custom break points
ggplot(a_visited_cal, aes(x = visitors)) +
  stat_ecdf() +
  scale_x_continuous(name = "No of visitors",
                     breaks = c(0, 5, 7, 9, 13, 15, 17, 20, 22, 25, 30, 35, 40, 50, 60, 70, 100, 150),
                     limits = c(0, 150)) +
  scale_y_continuous(name = "cdf",
                     breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1.0),
                     limits = c(0, 1))
## Warning: Removed 76 rows containing non-finite values (stat_ecdf).
summary(a_visited_cal$visitors)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 9.00 17.00 20.97 29.00 877.00
So, if we divide the dataset into 5 classes, our classes would be: Class 1: number of visitors between 0-6, size of 7; Class 2: number of visitors between 7-14, size of 8; Class 3: number of visitors between 14-22, size of 8; Class 4: number of visitors between 23-35, size of 13; Class 5: number of visitors 35 and more. The median is 17, and we divide the data at 17 if we use only 2 classes, to keep the data balanced.
# Peek at the store metadata table
head(a_store_addr)
## air_store_id air_genre_name air_area_name
## 1 air_0f0cdeee6c9bf3d7 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 2 air_7cc17a324ae5c7dc Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 3 air_fee8dcf4d619598e Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 4 air_a17f0778617c76e2 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri
## 5 air_83db5aff8f50478e Italian/French Tōkyō-to Minato-ku Shibakōen
## 6 air_99c3eae84130c1cb Italian/French Tōkyō-to Minato-ku Shibakōen
## latitude longitude
## 1 34.69512 135.1979
## 2 34.69512 135.1979
## 3 34.69512 135.1979
## 4 34.69512 135.1979
## 5 35.65807 139.7516
## 6 35.65807 139.7516
# Still 829 stores x 5 columns (per the output below)
dim(a_store_addr)
## [1] 829 5
Adding up all the reservations made more than a week before the visit date.
# Per store and visit date, total the covers reserved more than a week
# before the visit ("bookings known as of last week").
air_vis1 <- a_booked %>%
  filter((as.Date(visit_datetime) - as.Date(reserve_datetime)) > 7) %>%
  group_by(visit_datetime, air_store_id) %>%
  summarize(vis = sum(reserve_visitors))
# Rename columns to match the visit table's naming
colnames(air_vis1)[colnames(air_vis1) == "visit_datetime"] <- "visit_date"
colnames(air_vis1)[colnames(air_vis1) == "vis"] <- "bookingsTillLastWeek"
head(air_vis1)
## # A tibble: 6 x 3
## visit_date air_store_id bookingsTillLastWeek
## <date> <fct> <int>
## 1 2016-01-09 air_54ed43163b7596c4 5
## 2 2016-01-09 air_ee3a01f0c71a769f 3
## 3 2016-01-11 air_ee3a01f0c71a769f 2
## 4 2016-01-12 air_24b9b2a020826ede 3
## 5 2016-01-12 air_3c05c8f26c611eb9 10
## 6 2016-01-12 air_6d65542aa43b598b 2
Or, adding up all the reservations (not suitable for predicting the future).
# Per store and visit date, total ALL reserved covers (uses bookings made at
# any time, so it cannot be used for forward-looking prediction).
air_vis2 <- a_booked %>%
  group_by(visit_datetime, air_store_id) %>%
  summarize(vis = sum(reserve_visitors))
head(air_vis2)
## # A tibble: 6 x 3
## # Groups: visit_datetime [2]
## visit_datetime air_store_id vis
## <date> <fct> <int>
## 1 2016-01-01 air_877f79706adbfb06 3
## 2 2016-01-01 air_db4b38ebe7a7ceff 9
## 3 2016-01-01 air_db80363d35f10926 5
## 4 2016-01-02 air_2b8b29ddfd35018e 6
## 5 2016-01-02 air_3bb99a1fe0583897 4
## 6 2016-01-02 air_6b15edd1b4fbb96a 11
Merging bookings made at least one week in advance with the actual visits.
# Advance bookings per store/date
head(air_vis1)
## # A tibble: 6 x 3
## visit_date air_store_id bookingsTillLastWeek
## <date> <fct> <int>
## 1 2016-01-09 air_54ed43163b7596c4 5
## 2 2016-01-09 air_ee3a01f0c71a769f 3
## 3 2016-01-11 air_ee3a01f0c71a769f 2
## 4 2016-01-12 air_24b9b2a020826ede 3
## 5 2016-01-12 air_3c05c8f26c611eb9 10
## 6 2016-01-12 air_6d65542aa43b598b 2
# Actual visits per store/date
head(a_visited)
## air_store_id visit_date visitors
## 1 air_ba937bf13d40fb24 2016-01-13 25
## 2 air_ba937bf13d40fb24 2016-01-14 32
## 3 air_ba937bf13d40fb24 2016-01-15 29
## 4 air_ba937bf13d40fb24 2016-01-16 22
## 5 air_ba937bf13d40fb24 2016-01-18 6
## 6 air_ba937bf13d40fb24 2016-01-19 9
# Outer join: keep store/date rows that appear in either advance bookings or
# actual visits (missing side becomes NA).
airBV <- merge(air_vis1, a_visited, all = TRUE)
head(airBV)
## visit_date air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_04341b588bde96cd NA 10
## 2 2016-01-01 air_70e9e8cd55879414 NA 4
## 3 2016-01-01 air_7cc17a324ae5c7dc NA 2
## 4 2016-01-01 air_81c5dff692063446 NA 7
## 5 2016-01-01 air_877f79706adbfb06 NA 3
## 6 2016-01-01 air_87f9e1024b951f01 NA 17
# Missing visitors means no recorded visit — treat as zero.
airBV$visitors[is.na(airBV$visitors)] <- 0
head(airBV)
## visit_date air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_04341b588bde96cd NA 10
## 2 2016-01-01 air_70e9e8cd55879414 NA 4
## 3 2016-01-01 air_7cc17a324ae5c7dc NA 2
## 4 2016-01-01 air_81c5dff692063446 NA 7
## 5 2016-01-01 air_877f79706adbfb06 NA 3
## 6 2016-01-01 air_87f9e1024b951f01 NA 17
# Missing bookings means no advance booking — treat as zero.
airBV$bookingsTillLastWeek[is.na(airBV$bookingsTillLastWeek)] <- 0
head(airBV)
## visit_date air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_04341b588bde96cd 0 10
## 2 2016-01-01 air_70e9e8cd55879414 0 4
## 3 2016-01-01 air_7cc17a324ae5c7dc 0 2
## 4 2016-01-01 air_81c5dff692063446 0 7
## 5 2016-01-01 air_877f79706adbfb06 0 3
## 6 2016-01-01 air_87f9e1024b951f01 0 17
# First 50 rows. (Was airBV[0:50, ]; R silently drops a zero index, so this
# is the same result with the conventional 1-based range.)
airBV[1:50, ]
## visit_date air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_04341b588bde96cd 0 10
## 2 2016-01-01 air_70e9e8cd55879414 0 4
## 3 2016-01-01 air_7cc17a324ae5c7dc 0 2
## 4 2016-01-01 air_81c5dff692063446 0 7
## 5 2016-01-01 air_877f79706adbfb06 0 3
## 6 2016-01-01 air_87f9e1024b951f01 0 17
## 7 2016-01-01 air_883ca28ef0ed3d55 0 4
## 8 2016-01-01 air_9d452a881f7f2bb7 0 1
## 9 2016-01-01 air_a083834e7ffe187e 0 27
## 10 2016-01-01 air_cb7467aed805e7fe 0 33
## 11 2016-01-01 air_db4b38ebe7a7ceff 0 21
## 12 2016-01-01 air_db80363d35f10926 0 8
## 13 2016-01-01 air_f690c42545146e0a 0 7
## 14 2016-01-01 air_fe22ef5a9cbef123 0 21
## 15 2016-01-01 air_05c325d315cc17f5 0 29
## 16 2016-01-01 air_08ba8cd01b3ba010 0 11
## 17 2016-01-01 air_09a845d5b5944b01 0 56
## 18 2016-01-01 air_1f7f8fa557bc0d55 0 6
## 19 2016-01-01 air_2570ccb93badde68 0 43
## 20 2016-01-01 air_25d8e5cc57dd87d9 0 1
## 21 2016-01-01 air_298513175efdf261 0 12
## 22 2016-01-01 air_35c4732dcbfe31be 0 16
## 23 2016-01-01 air_36bcf77d3382d36e 0 34
## 24 2016-01-01 air_39dccf7df20b1c6a 0 55
## 25 2016-01-01 air_3b6438b125086430 0 10
## 26 2016-01-01 air_506fe758114df773 0 4
## 27 2016-01-01 air_536043fcf1a4f8a4 0 42
## 28 2016-01-01 air_57ed725a1930a5b9 0 33
## 29 2016-01-01 air_5b6d18c470bbfaf9 0 21
## 30 2016-01-01 air_5c65468938c07fa5 0 8
## 31 2016-01-01 air_60a7057184ec7ec7 0 64
## 32 2016-01-01 air_6e3fd96320d24324 0 32
## 33 2016-01-01 air_71903025d39a4571 0 5
## 34 2016-01-01 air_79f528087f49df06 0 42
## 35 2016-01-01 air_8d50c64692322dff 0 1
## 36 2016-01-01 air_b2d8bc9c88b85f96 0 16
## 37 2016-01-01 air_c31472d14e29cee8 0 3
## 38 2016-01-01 air_c92745dfdd2ec68a 0 31
## 39 2016-01-01 air_cfdeb326418194ff 0 4
## 40 2016-01-01 air_d0a1e69685259c92 0 7
## 41 2016-01-01 air_d0a7bd3339c3d12a 0 62
## 42 2016-01-01 air_d0e8a085d8dc83aa 0 8
## 43 2016-01-01 air_d97dabf7aae60da5 0 102
## 44 2016-01-01 air_e483f5b3c4f310e0 0 1
## 45 2016-01-01 air_eb2d2653586315dd 0 16
## 46 2016-01-01 air_efc80d3f96b3aff7 0 10
## 47 2016-01-01 air_f26f36ec4dc5adb0 0 64
## 48 2016-01-01 air_fab092c35776a9b1 0 19
## 49 2016-01-02 air_142e78ba7001da9c 0 6
## 50 2016-01-02 air_1653a6c513865af3 0 35
# Column summaries of the merged bookings/visits table
summary(airBV)
## visit_date air_store_id bookingsTillLastWeek
## Min. :2016-01-01 air_640cf4835f0d9ba3: 483 Min. : 0.0000
## 1st Qu.:2016-07-23 air_a083834e7ffe187e: 480 1st Qu.: 0.0000
## Median :2016-10-24 air_8093d0b565e9dbdf: 479 Median : 0.0000
## Mean :2016-10-13 air_3c05c8f26c611eb9: 478 Mean : 0.6388
## 3rd Qu.:2017-01-25 air_6b15edd1b4fbb96a: 478 3rd Qu.: 0.0000
## Max. :2017-05-31 air_5c817ef28f236bdf: 477 Max. :2232.0000
## (Other) :250478
## visitors
## Min. : 0.00
## 1st Qu.: 9.00
## Median : 17.00
## Mean : 20.87
## 3rd Qu.: 29.00
## Max. :877.00
##
# 253353 rows x 4 columns after the outer join (per the output below)
dim(airBV)
## [1] 253353 4
Scatter of actual visitors against one-week-advance bookings (filtered to visitors < 400 and bookingsTillLastWeek < 150), with the identity line in grey and a linear fit in blue: p <- airBV %>% filter(visitors < 400, bookingsTillLastWeek < 150) %>% ggplot(aes(bookingsTillLastWeek, visitors)) + geom_point(color = "black", alpha = 0.5) + geom_abline(slope = 1, intercept = 0, color = "grey60") + geom_smooth(method = "lm", color = "blue"); p. Points below the grey line show that more people booked in advance than visited; points above the grey line show that fewer people booked one week in advance than actually visited.
Merging bookings, visits, and location information
# Join in store metadata (genre, area, coordinates)
airBVI <- merge(airBV, a_store_addr)
airBVI$visit_date <- as.Date(airBVI$visit_date)
head(airBVI)
## air_store_id visit_date bookingsTillLastWeek visitors
## 1 air_00a91d42b08b08d9 2017-01-21 0 11
## 2 air_00a91d42b08b08d9 2017-04-01 0 7
## 3 air_00a91d42b08b08d9 2017-02-21 0 36
## 4 air_00a91d42b08b08d9 2016-07-21 0 47
## 5 air_00a91d42b08b08d9 2017-02-24 0 28
## 6 air_00a91d42b08b08d9 2016-12-22 0 37
## air_genre_name air_area_name latitude longitude
## 1 Italian/French Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536
## 2 Italian/French Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536
## 3 Italian/French Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536
## 4 Italian/French Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536
## 5 Italian/French Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536
## 6 Italian/French Tōkyō-to Chiyoda-ku Kudanminami 35.694 139.7536
Some preprocessing on the calendar before merging it with the main data.
# Rename the calendar's date column to match the visit tables, and convert
# it to Date so the merge keys align.
names(cal)[names(cal) == "calendar_date"] <- "visit_date"
cal$visit_date <- as.Date(cal$visit_date)
head(cal)
## visit_date day_of_week holiday_flg
## 1 2016-01-01 Friday 1
## 2 2016-01-02 Saturday 1
## 3 2016-01-03 Sunday 1
## 4 2016-01-04 Monday 0
## 5 2016-01-05 Tuesday 0
## 6 2016-01-06 Wednesday 0
Merging bookings, visits, location, and calendar information; ordering the data by date.
# Join in the calendar (day of week, holiday flag)
airBVIC <- merge(airBVI, cal)
head(airBVIC)
## visit_date air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_05c325d315cc17f5 0 29
## 2 2016-01-01 air_f690c42545146e0a 0 7
## 3 2016-01-01 air_d0a1e69685259c92 0 7
## 4 2016-01-01 air_5b6d18c470bbfaf9 0 21
## 5 2016-01-01 air_60a7057184ec7ec7 0 64
## 6 2016-01-01 air_f26f36ec4dc5adb0 0 64
## air_genre_name air_area_name latitude
## 1 Izakaya Fukuoka-ken Fukuoka-shi Daimyō 33.58922
## 2 Japanese food Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682
## 3 Creative cuisine Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## 4 Cafe/Sweets Tōkyō-to Bunkyō-ku Kasuga 35.70807
## 5 Izakaya Fukuoka-ken Fukuoka-shi Torikai 33.57569
## 6 Izakaya Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## longitude day_of_week holiday_flg
## 1 130.3928 Friday 1
## 2 141.3540 Friday 1
## 3 139.7035 Friday 1
## 4 139.7522 Friday 1
## 5 130.3700 Friday 1
## 6 139.7035 Friday 1
# Sort chronologically by visit date
airBVIC <- airBVIC[order(airBVIC$visit_date), ]
head(airBVIC)
## visit_date air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_05c325d315cc17f5 0 29
## 2 2016-01-01 air_f690c42545146e0a 0 7
## 3 2016-01-01 air_d0a1e69685259c92 0 7
## 4 2016-01-01 air_5b6d18c470bbfaf9 0 21
## 5 2016-01-01 air_60a7057184ec7ec7 0 64
## 6 2016-01-01 air_f26f36ec4dc5adb0 0 64
## air_genre_name air_area_name latitude
## 1 Izakaya Fukuoka-ken Fukuoka-shi Daimyō 33.58922
## 2 Japanese food Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682
## 3 Creative cuisine Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## 4 Cafe/Sweets Tōkyō-to Bunkyō-ku Kasuga 35.70807
## 5 Izakaya Fukuoka-ken Fukuoka-shi Torikai 33.57569
## 6 Izakaya Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## longitude day_of_week holiday_flg
## 1 130.3928 Friday 1
## 2 141.3540 Friday 1
## 3 139.7035 Friday 1
## 4 139.7522 Friday 1
## 5 130.3700 Friday 1
## 6 139.7035 Friday 1
# Latest dates: advance bookings present but zero recorded visitors
tail(airBVIC)
## visit_date air_store_id bookingsTillLastWeek visitors
## 253348 2017-05-29 air_db4b38ebe7a7ceff 6 0
## 253349 2017-05-29 air_12c4fb7a423df20d 9 0
## 253350 2017-05-30 air_1033310359ceeac1 8 0
## 253351 2017-05-31 air_877f79706adbfb06 3 0
## 253352 2017-05-31 air_900d755ebd2f7bbd 10 0
## 253353 2017-05-31 air_3cad29d1a23209d2 3 0
## air_genre_name air_area_name latitude longitude
## 253348 Dining bar Ōsaka-fu Ōsaka-shi Shinmachi 34.67623 135.4861
## 253349 Western food Tōkyō-to Shibuya-ku Shibuya 35.66178 139.7041
## 253350 Izakaya Tōkyō-to Kōtō-ku Tomioka 35.67127 139.7970
## 253351 Japanese food Tōkyō-to Minato-ku Shibakōen 35.65807 139.7516
## 253352 Italian/French Tōkyō-to Chūō-ku Ginza 35.67211 139.7708
## 253353 Japanese food Tōkyō-to Shibuya-ku Shibuya 35.66178 139.7041
## day_of_week holiday_flg
## 253348 Monday 0
## 253349 Monday 0
## 253350 Tuesday 0
## 253351 Wednesday 0
## 253352 Wednesday 0
## 253353 Wednesday 0
# 253353 rows x 10 columns (per the output below)
dim(airBVIC)
## [1] 253353 10
Removing air_area_name, as it gives essentially no more information than the location coordinates.
# Drop air_area_name (redundant with latitude/longitude).
# Selecting by name is safer than the positional airBVIC[, -6], which breaks
# silently if column order ever changes.
airBVICwoAname <- airBVIC[, names(airBVIC) != "air_area_name"]
head(airBVICwoAname)
## visit_date air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_05c325d315cc17f5 0 29
## 2 2016-01-01 air_f690c42545146e0a 0 7
## 3 2016-01-01 air_d0a1e69685259c92 0 7
## 4 2016-01-01 air_5b6d18c470bbfaf9 0 21
## 5 2016-01-01 air_60a7057184ec7ec7 0 64
## 6 2016-01-01 air_f26f36ec4dc5adb0 0 64
## air_genre_name latitude longitude day_of_week holiday_flg
## 1 Izakaya 33.58922 130.3928 Friday 1
## 2 Japanese food 43.05682 141.3540 Friday 1
## 3 Creative cuisine 35.69384 139.7035 Friday 1
## 4 Cafe/Sweets 35.70807 139.7522 Friday 1
## 5 Izakaya 33.57569 130.3700 Friday 1
## 6 Izakaya 35.69384 139.7035 Friday 1
The restaurant id is itself a very strong indicator of whether visitors will come: if given, nothing else is more informative.
# Store ids serve as point labels for the PCA scatter; work on a copy.
labels <- airBVIC$air_store_id
d <- airBVIC
# Run PCA (centered and scaled) on a numeric dataset and return the scores.
#
# dataset        numeric matrix / data frame; every column needs nonzero
#                variance, since scale = TRUE divides by each column's sd
# lbls           labels used to annotate the PC1/PC2 scatter plot
# do.screeplot   if TRUE, plot the variance explained per component
# do.scatter     if TRUE, plot observations on PC1/PC2 labelled with lbls
# do.biplot      if TRUE, draw the standard biplot (scores + loadings)
# do.loadingplot if TRUE, line-plot the PC1 loadings
# Returns: matrix of principal-component scores (one row per observation).
#
# Defaults changed from `F` to `FALSE`: T/F are ordinary reassignable
# variables in R, so the literals are the safe form.
do.pca <- function(dataset, lbls,
                   do.screeplot = FALSE, do.scatter = FALSE,
                   do.biplot = FALSE, do.loadingplot = FALSE) {
  data.pca <- prcomp(dataset, scale = TRUE)
  data.pc <- predict(data.pca)  # scores of the original observations
  if (do.screeplot) plot(data.pca, main = 'screeplot for PCA')
  if (do.scatter) {
    plot(data.pc[, 1:2], type = "n")
    text(x = data.pc[, 1], y = data.pc[, 2], labels = lbls)
  }
  if (do.biplot) biplot(data.pca)
  if (do.loadingplot) {
    plot(data.pca$rotation[, 1], type = 'l')
    # plot(data.pc[,1],type='l')
  }
  data.pc
}
# Numeric recodings for PCA: dates become day counts, store ids become
# their factor level codes.
d$visit_date <- as.numeric(d$visit_date)
d$air_store_id <- as.numeric(d$air_store_id)
d$air_genre_name <- droplevels(d$air_genre_name)
head(d)
## visit_date air_store_id bookingsTillLastWeek visitors air_genre_name
## 1 16801 321 0 29 Izakaya
## 2 16801 302 0 7 Japanese food
## 3 16801 734 0 7 Creative cuisine
## 4 16801 478 0 21 Cafe/Sweets
## 5 16801 493 0 64 Izakaya
## 6 16801 806 0 64 Izakaya
## air_area_name latitude longitude day_of_week
## 1 Fukuoka-ken Fukuoka-shi Daimyō 33.58922 130.3928 Friday
## 2 Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682 141.3540 Friday
## 3 Tōkyō-to Shinjuku-ku Kabukichō 35.69384 139.7035 Friday
## 4 Tōkyō-to Bunkyō-ku Kasuga 35.70807 139.7522 Friday
## 5 Fukuoka-ken Fukuoka-shi Torikai 33.57569 130.3700 Friday
## 6 Tōkyō-to Shinjuku-ku Kabukichō 35.69384 139.7035 Friday
## holiday_flg
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
# Original (un-recoded) table, for comparison with d above
head(airBVIC)
## visit_date air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_05c325d315cc17f5 0 29
## 2 2016-01-01 air_f690c42545146e0a 0 7
## 3 2016-01-01 air_d0a1e69685259c92 0 7
## 4 2016-01-01 air_5b6d18c470bbfaf9 0 21
## 5 2016-01-01 air_60a7057184ec7ec7 0 64
## 6 2016-01-01 air_f26f36ec4dc5adb0 0 64
## air_genre_name air_area_name latitude
## 1 Izakaya Fukuoka-ken Fukuoka-shi Daimyō 33.58922
## 2 Japanese food Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682
## 3 Creative cuisine Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## 4 Cafe/Sweets Tōkyō-to Bunkyō-ku Kasuga 35.70807
## 5 Izakaya Fukuoka-ken Fukuoka-shi Torikai 33.57569
## 6 Izakaya Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## longitude day_of_week holiday_flg
## 1 130.3928 Friday 1
## 2 141.3540 Friday 1
## 3 139.7035 Friday 1
## 4 139.7522 Friday 1
## 5 130.3700 Friday 1
## 6 139.7035 Friday 1
# Rebuild d without the store-id column (second column of airBVIC)
d <- as.data.frame(airBVIC[, -2])
head(d)
## visit_date bookingsTillLastWeek visitors air_genre_name
## 1 2016-01-01 0 29 Izakaya
## 2 2016-01-01 0 7 Japanese food
## 3 2016-01-01 0 7 Creative cuisine
## 4 2016-01-01 0 21 Cafe/Sweets
## 5 2016-01-01 0 64 Izakaya
## 6 2016-01-01 0 64 Izakaya
## air_area_name latitude longitude day_of_week
## 1 Fukuoka-ken Fukuoka-shi Daimyō 33.58922 130.3928 Friday
## 2 Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682 141.3540 Friday
## 3 Tōkyō-to Shinjuku-ku Kabukichō 35.69384 139.7035 Friday
## 4 Tōkyō-to Bunkyō-ku Kasuga 35.70807 139.7522 Friday
## 5 Fukuoka-ken Fukuoka-shi Torikai 33.57569 130.3700 Friday
## 6 Tōkyō-to Shinjuku-ku Kabukichō 35.69384 139.7035 Friday
## holiday_flg
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
To apply classification techniques, we need to divide the output into classes.
# Binarise the target at the median (17): 0 = fewer than 17 visitors,
# 1 = 17 or more; stored as a factor for the classifiers.
airBVICC <- airBVIC
airBVICC$visitors <- factor(ifelse(airBVICC$visitors >= 17, 1, 0))
head(airBVICC)
## visit_date air_store_id bookingsTillLastWeek visitors
## 1 2016-01-01 air_05c325d315cc17f5 0 1
## 2 2016-01-01 air_f690c42545146e0a 0 0
## 3 2016-01-01 air_d0a1e69685259c92 0 0
## 4 2016-01-01 air_5b6d18c470bbfaf9 0 1
## 5 2016-01-01 air_60a7057184ec7ec7 0 1
## 6 2016-01-01 air_f26f36ec4dc5adb0 0 1
## air_genre_name air_area_name latitude
## 1 Izakaya Fukuoka-ken Fukuoka-shi Daimyō 33.58922
## 2 Japanese food Hokkaidō Sapporo-shi Minami 3 Jōnishi 43.05682
## 3 Creative cuisine Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## 4 Cafe/Sweets Tōkyō-to Bunkyō-ku Kasuga 35.70807
## 5 Izakaya Fukuoka-ken Fukuoka-shi Torikai 33.57569
## 6 Izakaya Tōkyō-to Shinjuku-ku Kabukichō 35.69384
## longitude day_of_week holiday_flg
## 1 130.3928 Friday 1
## 2 141.3540 Friday 1
## 3 139.7035 Friday 1
## 4 139.7522 Friday 1
## 5 130.3700 Friday 1
## 6 139.7035 Friday 1
grouping and changing types of some predictors.
Date was needed to combine the data. but date itself doesn’t give much information. The information it can give which is day, holiday, weather, etc. Most of these variables we have in our dataset. But many dates will make it difficult for us to classify. To keep track of weather, the month could also give sufficient information as date, but reduces the computation.So we are converting the date to month after all tables are merged. Surely, it will reduce the accuracy if we had lot of data, but we don’t have very large data to leverage the date information anyway.
air_area_name is not useful, as it gives no more information than what's given by the location coordinates. It was only useful to match the weather and main data.
# Reduce visit_date to the month number (1-12): the calendar columns already
# carry day-of-week and holiday, so month keeps the seasonal signal while
# shrinking the feature space. Store ids become factor level codes.
airc <- airBVICC
airc$visit_date <- as.numeric(strftime(airc$visit_date, "%m"))
airc$air_store_id <- as.numeric(airc$air_store_id)
airc$air_genre_name <- droplevels(airc$air_genre_name)
summary(airc)
## visit_date air_store_id bookingsTillLastWeek visitors
## Min. : 1.000 Min. : 1.0 Min. : 0.0000 0:124016
## 1st Qu.: 3.000 1st Qu.:200.0 1st Qu.: 0.0000 1:129337
## Median : 7.000 Median :409.0 Median : 0.0000
## Mean : 6.203 Mean :410.2 Mean : 0.6388
## 3rd Qu.:10.000 3rd Qu.:618.0 3rd Qu.: 0.0000
## Max. :12.000 Max. :829.0 Max. :2232.0000
##
## air_genre_name air_area_name
## Izakaya :62417 Fukuoka-ken Fukuoka-shi Daimyō: 19861
## Cafe/Sweets :52843 Tōkyō-to Shibuya-ku Shibuya : 17456
## Dining bar :34319 Tōkyō-to Minato-ku Shibakōen : 14747
## Italian/French:30280 Tōkyō-to Shinjuku-ku Kabukichō: 12548
## Bar/Cocktail :25161 Tōkyō-to Setagaya-ku Setagaya : 8722
## Japanese food :18910 Tōkyō-to Chūō-ku Tsukiji : 8396
## (Other) :29423 (Other) :171623
## latitude longitude day_of_week holiday_flg
## Min. :33.21 Min. :130.2 Friday :40590 Min. :0.00000
## 1st Qu.:34.69 1st Qu.:135.3 Monday :31801 1st Qu.:0.00000
## Median :35.66 Median :139.7 Saturday :39489 Median :0.00000
## Mean :35.62 Mean :137.4 Sunday :30135 Mean :0.05133
## 3rd Qu.:35.69 3rd Qu.:139.8 Thursday :38171 3rd Qu.:0.00000
## Max. :44.02 Max. :144.3 Tuesday :36177 Max. :1.00000
## Wednesday:36990
# Move visitors to a response column named y, and drop air_area_name.
# Name-based selection replaces the fragile positional airc[, -6] / airc[, -4]
# drops; the resulting column order is identical (y appended last).
airc$y <- airc$visitors
airc <- airc[, !(names(airc) %in% c("air_area_name", "visitors"))]
dim(airc)
## [1] 253353 9
head(airc)
## visit_date air_store_id bookingsTillLastWeek air_genre_name latitude
## 1 1 321 0 Izakaya 33.58922
## 2 1 302 0 Japanese food 43.05682
## 3 1 734 0 Creative cuisine 35.69384
## 4 1 478 0 Cafe/Sweets 35.70807
## 5 1 493 0 Izakaya 33.57569
## 6 1 806 0 Izakaya 35.69384
## longitude day_of_week holiday_flg y
## 1 130.3928 Friday 1 1
## 2 141.3540 Friday 1 0
## 3 139.7035 Friday 1 0
## 4 139.7522 Friday 1 1
## 5 130.3700 Friday 1 1
## 6 139.7035 Friday 1 1
# Work on a copy for the dummy-variable expansion
aira <- airc
head(aira)
## visit_date air_store_id bookingsTillLastWeek air_genre_name latitude
## 1 1 321 0 Izakaya 33.58922
## 2 1 302 0 Japanese food 43.05682
## 3 1 734 0 Creative cuisine 35.69384
## 4 1 478 0 Cafe/Sweets 35.70807
## 5 1 493 0 Izakaya 33.57569
## 6 1 806 0 Izakaya 35.69384
## longitude day_of_week holiday_flg y
## 1 130.3928 Friday 1 1
## 2 141.3540 Friday 1 0
## 3 139.7035 Friday 1 0
## 4 139.7522 Friday 1 1
## 5 130.3700 Friday 1 1
## 6 139.7035 Friday 1 1
# Expand factor predictors into indicator columns (adds an intercept column)
aira <- model.matrix(~ ., data = aira)
head(aira)
## (Intercept) visit_date air_store_id bookingsTillLastWeek
## 1 1 1 321 0
## 2 1 1 302 0
## 3 1 1 734 0
## 4 1 1 478 0
## 5 1 1 493 0
## 6 1 1 806 0
## air_genre_nameCafe/Sweets air_genre_nameCreative cuisine
## 1 0 0
## 2 0 0
## 3 0 1
## 4 1 0
## 5 0 0
## 6 0 0
## air_genre_nameDining bar air_genre_nameItalian/French
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameIzakaya air_genre_nameJapanese food
## 1 1 0
## 2 0 1
## 3 0 0
## 4 0 0
## 5 1 0
## 6 1 0
## air_genre_nameOkonomiyaki/Monja/Teppanyaki air_genre_nameOther
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameWestern food air_genre_nameYakiniku/Korean food latitude
## 1 0 0 33.58922
## 2 0 0 43.05682
## 3 0 0 35.69384
## 4 0 0 35.70807
## 5 0 0 33.57569
## 6 0 0 35.69384
## longitude day_of_weekMonday day_of_weekSaturday day_of_weekSunday
## 1 130.3928 0 0 0
## 2 141.3540 0 0 0
## 3 139.7035 0 0 0
## 4 139.7522 0 0 0
## 5 130.3700 0 0 0
## 6 139.7035 0 0 0
## day_of_weekThursday day_of_weekTuesday day_of_weekWednesday holiday_flg
## 1 0 0 0 1
## 2 0 0 0 1
## 3 0 0 0 1
## 4 0 0 0 1
## 5 0 0 0 1
## 6 0 0 0 1
## y1
## 1 1
## 2 0
## 3 0
## 4 1
## 5 1
## 6 1
# Back to a data frame; restore the response name and drop the intercept.
aira <- data.frame(aira)
names(aira)[names(aira) == "y1"] <- "y"
aira$y <- as.numeric(aira$y)
aira <- as.data.frame(aira[, -1])  # column 1 is "(Intercept)"
dim(aira)
## [1] 253353 23
head(aira)
## visit_date air_store_id bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1 1 321 0 0
## 2 1 302 0 0
## 3 1 734 0 0
## 4 1 478 0 1
## 5 1 493 0 0
## 6 1 806 0 0
## air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1 0 0
## 2 0 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameItalian.French air_genre_nameIzakaya
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 1
## air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1 0 0
## 2 1 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameOther air_genre_nameWestern.food
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1 0 33.58922 130.3928 0
## 2 0 43.05682 141.3540 0
## 3 0 35.69384 139.7035 0
## 4 0 35.70807 139.7522 0
## 5 0 33.57569 130.3700 0
## 6 0 35.69384 139.7035 0
## day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1 0 0 1 1
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 1
## 5 0 0 1 1
## 6 0 0 1 1
head(aira)
## visit_date air_store_id bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1 1 321 0 0
## 2 1 302 0 0
## 3 1 734 0 0
## 4 1 478 0 1
## 5 1 493 0 0
## 6 1 806 0 0
## air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1 0 0
## 2 0 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameItalian.French air_genre_nameIzakaya
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 1
## air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1 0 0
## 2 1 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameOther air_genre_nameWestern.food
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1 0 33.58922 130.3928 0
## 2 0 43.05682 141.3540 0
## 3 0 35.69384 139.7035 0
## 4 0 35.70807 139.7522 0
## 5 0 33.57569 130.3700 0
## 6 0 35.69384 139.7035 0
## day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1 0 0 1 1
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 1
## 5 0 0 1 1
## 6 0 0 1 1
dim(aira)
## [1] 253353 23
summary(aira[1:30000,])
## visit_date air_store_id bookingsTillLastWeek
## Min. :1.000 Min. : 4.0 Min. : 0.0000
## 1st Qu.:2.000 1st Qu.:184.0 1st Qu.: 0.0000
## Median :3.000 Median :408.0 Median : 0.0000
## Mean :2.485 Mean :404.3 Mean : 0.5126
## 3rd Qu.:3.000 3rd Qu.:608.0 3rd Qu.: 0.0000
## Max. :4.000 Max. :826.0 Max. :79.0000
## air_genre_nameCafe.Sweets air_genre_nameCreative.cuisine
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000
## Mean :0.2006 Mean :0.01327
## 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000
## air_genre_nameDining.bar air_genre_nameItalian.French
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000
## Mean :0.1266 Mean :0.1158
## 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000
## air_genre_nameIzakaya air_genre_nameJapanese.food
## Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.000
## Mean :0.2695 Mean :0.079
## 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :1.0000 Max. :1.000
## air_genre_nameOkonomiyaki.Monja.Teppanyaki air_genre_nameOther
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000
## Mean :0.0134 Mean :0.03583
## 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000
## air_genre_nameWestern.food air_genre_nameYakiniku.Korean.food
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.01703 Mean :0.0243
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000
## latitude longitude day_of_weekMonday day_of_weekSaturday
## Min. :33.21 Min. :130.2 Min. :0.0000 Min. :0.0000
## 1st Qu.:34.68 1st Qu.:135.3 1st Qu.:0.0000 1st Qu.:0.0000
## Median :35.66 Median :139.7 Median :0.0000 Median :0.0000
## Mean :35.51 Mean :137.2 Mean :0.1281 Mean :0.1558
## 3rd Qu.:35.69 3rd Qu.:139.8 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :44.02 Max. :144.3 Max. :1.0000 Max. :1.0000
## day_of_weekSunday day_of_weekThursday day_of_weekTuesday
## Min. :0.0000 Min. :0.000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.0000 Median :0.000 Median :0.000
## Mean :0.1192 Mean :0.148 Mean :0.146
## 3rd Qu.:0.0000 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.0000 Max. :1.000 Max. :1.000
## day_of_weekWednesday holiday_flg y
## Min. :0.000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.000 Median :0.00000 Median :1.0000
## Mean :0.145 Mean :0.03673 Mean :0.5327
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.00000 Max. :1.0000
airs=airc
colnames(airs)[which(names(airs) == "visitors")] <- "y"
head(airs)
## visit_date air_store_id bookingsTillLastWeek air_genre_name latitude
## 1 1 321 0 Izakaya 33.58922
## 2 1 302 0 Japanese food 43.05682
## 3 1 734 0 Creative cuisine 35.69384
## 4 1 478 0 Cafe/Sweets 35.70807
## 5 1 493 0 Izakaya 33.57569
## 6 1 806 0 Izakaya 35.69384
## longitude day_of_week holiday_flg y
## 1 130.3928 Friday 1 1
## 2 141.3540 Friday 1 0
## 3 139.7035 Friday 1 0
## 4 139.7522 Friday 1 1
## 5 130.3700 Friday 1 1
## 6 139.7035 Friday 1 1
airs$y=as.numeric(airs$y)
#res=my.classifier(aira[1:50000,], cl.name='knn',do.cv=F)
We need to convert the categorical predictors into numeric predictors for kNN.
#summary(airBVICC$bookingsTillLastWeek)
## Build a fully numeric copy of airBVICC for kNN: keep only the modelling
## columns, then convert the date and factor predictors to their numeric
## codes (as.numeric on a factor yields the underlying level codes).
keep.cols <- c("visit_date", "air_store_id", "bookingsTillLastWeek",
               "air_genre_name", "air_area_name", "latitude", "longitude",
               "day_of_week", "holiday_flg", "visitors")
airBVICCN <- airBVICC[, keep.cols]
for (col in c("visit_date", "air_store_id", "air_genre_name",
              "air_area_name", "day_of_week")) {
  airBVICCN[[col]] <- as.numeric(airBVICCN[[col]])
}
head(airBVICCN)
## visit_date air_store_id bookingsTillLastWeek air_genre_name
## 1 16801 321 0 6
## 2 16801 302 0 7
## 3 16801 734 0 3
## 4 16801 478 0 2
## 5 16801 493 0 6
## 6 16801 806 0 6
## air_area_name latitude longitude day_of_week holiday_flg visitors
## 1 1 33.58922 130.3928 1 1 1
## 2 24 43.05682 141.3540 1 1 0
## 3 92 35.69384 139.7035 1 1 0
## 4 54 35.70807 139.7522 1 1 1
## 5 8 33.57569 130.3700 1 1 1
## 6 92 35.69384 139.7035 1 1 1
summary(airBVICCN)
## visit_date air_store_id bookingsTillLastWeek air_genre_name
## Min. :16801 Min. : 1.0 Min. : 0.0000 Min. : 1.000
## 1st Qu.:17005 1st Qu.:200.0 1st Qu.: 0.0000 1st Qu.: 2.000
## Median :17098 Median :409.0 Median : 0.0000 Median : 5.000
## Mean :17087 Mean :410.2 Mean : 0.6388 Mean : 4.671
## 3rd Qu.:17191 3rd Qu.:618.0 3rd Qu.: 0.0000 3rd Qu.: 6.000
## Max. :17317 Max. :829.0 Max. :2232.0000 Max. :11.000
## air_area_name latitude longitude day_of_week
## Min. : 1.00 Min. :33.21 Min. :130.2 Min. :1.000
## 1st Qu.: 25.00 1st Qu.:34.69 1st Qu.:135.3 1st Qu.:2.000
## Median : 57.00 Median :35.66 Median :139.7 Median :4.000
## Mean : 52.94 Mean :35.62 Mean :137.4 Mean :3.987
## 3rd Qu.: 85.00 3rd Qu.:35.69 3rd Qu.:139.8 3rd Qu.:6.000
## Max. :103.00 Max. :44.02 Max. :144.3 Max. :7.000
## holiday_flg visitors
## Min. :0.00000 0:124016
## 1st Qu.:0.00000 1:129337
## Median :0.00000
## Mean :0.05133
## 3rd Qu.:0.00000
## Max. :1.00000
## standardize/normalize data
# Z-score the nine numeric predictor columns (mean 0, sd 1) so kNN
# distances are not dominated by large-scale columns such as visit_date;
# column 10 ("visitors", the factor target) is deliberately left untouched.
airs=airBVICCN
airs[1:9] <- as.data.frame(scale(airBVICCN[,1:9]))
summary(airs)
## visit_date air_store_id bookingsTillLastWeek
## Min. :-2.31847 Min. :-1.704132 Min. : -0.08116
## 1st Qu.:-0.66690 1st Qu.:-0.875300 1st Qu.: -0.08116
## Median : 0.08603 Median :-0.004817 Median : -0.08116
## Mean : 0.00000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.83895 3rd Qu.: 0.865665 3rd Qu.: -0.08116
## Max. : 1.85903 Max. : 1.744478 Max. :283.46359
## air_genre_name air_area_name latitude longitude
## Min. :-1.4568 Min. :-1.6160 Min. :-1.17336 Min. :-1.9507
## 1st Qu.:-1.0600 1st Qu.:-0.8693 1st Qu.:-0.45101 1st Qu.:-0.5700
## Median : 0.1304 Median : 0.1263 Median : 0.02023 Median : 0.6296
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.5272 3rd Qu.: 0.9975 3rd Qu.: 0.03776 3rd Qu.: 0.6518
## Max. : 2.5112 Max. : 1.5575 Max. : 4.10077 Max. : 1.8833
## day_of_week holiday_flg visitors
## Min. :-1.468681 Min. :-0.2326 0:124016
## 1st Qu.:-0.976943 1st Qu.:-0.2326 1:129337
## Median : 0.006533 Median :-0.2326
## Mean : 0.000000 Mean : 0.0000
## 3rd Qu.: 0.990009 3rd Qu.:-0.2326
## Max. : 1.481748 Max. : 4.2991
set.seed(12345) # fix the RNG seed so results are reproducible across runs

## Train the classifier named `cl.name` on train.set and return raw
## predicted probabilities of the positive class for test.set (raw
## probabilities rather than binary decisions, so ROC curves can be
## plotted later with ROCR).
##
## Assumptions (unchanged from the original code):
##   - for the knn* variants the class label is in column 1 and all other
##     columns are numeric predictors;
##   - for every formula-based model the label column is named `y`.
do.classification <- function(train.set, test.set,
                              cl.name, verbose=F) {
  ## Shared kNN runner (replaces four copy-pasted branches). class::knn()
  ## attaches the vote proportion of the *winning* class; for predictions
  ## of class 0 we flip it so the returned vector is always P(y = 1).
  run.knn <- function(k) {
    pred = knn(train.set[,-1], test.set[,-1], cl=train.set[,1], k = k, prob=T)
    attr(pred,"prob")[pred==0] = 1-attr(pred,"prob")[pred==0]
    attr(pred,"prob")
  }
  switch(cl.name,
         knn   = run.knn(1),   # k=1; other k's available via the variants below
         knn3  = run.knn(3),
         knn5  = run.knn(5),
         knn10 = run.knn(10),
         lr = { # logistic regression
           model = glm(y~., family=binomial(link="logit"), data=train.set)
           if (verbose) {
             print(summary(model))
           }
           predict(model, newdata=test.set, type="response")
         },
         nb = { # naive Bayes
           model = naiveBayes(y~., data=train.set)
           prob = predict(model, newdata=test.set, type="raw")
           prob[,2]/rowSums(prob) # renormalize to P(class 1)
         },
         dtree = { # rpart regression tree on the 0/1 target
           model = rpart(y~., data=train.set, method="anova")
           predict(model, newdata=test.set)
         },
         dtree2 = { # tree::tree with loosened stopping rules
           model = tree(y~.,data=as.data.frame(train.set),mindev=0.02, mincut=3)
           predict(model, newdata=test.set)
         },
         dtree3 = {
           # NOTE(review): dtree() is not defined by any package loaded in
           # this script, so this branch errors if selected. Left as-is —
           # confirm the intended function (perhaps caret::train) before use.
           model = dtree(y~.,data=as.data.frame(train.set),methods=c("lm"), tuneLength=3)
           prob = predict(model, newdata=as.data.frame(test.set))
           plot(model)
           text(model,digits=2)
           prob
         },
         dtreeprune = { # rpart grown deep, then cost-complexity pruned
           model <- rpart(y~., data=train.set,control =rpart.control(minsplit=10,minbucket=3, cp=0.001,maxdepth=10))
           ## prune at the cp with the smallest cross-validated error
           pfit <- prune(model, cp=model$cptable[which.min(model$cptable[,"xerror"]),"CP"])
           prob = predict(pfit, newdata=test.set)
           ## visualize the pruned tree
           plot(pfit, uniform=TRUE,main="Pruned Classification Tree")
           text(pfit, use.n=TRUE, all=TRUE, cex=.8)
           prob
         },
         ada = { # AdaBoost
           model = ada(y~., data = train.set)
           prob = predict(model, newdata=test.set, type='probs')
           prob[,2]/rowSums(prob) # renormalize to P(class 1)
         }
  )
}
To perform time-series cross-validation:
## Rolling-origin ("time series") cross-validation.
##
## The data — assumed to be in chronological order — is cut into `t.fold`
## equal slices; fold t trains on slices [t, t+1], validates on slice t+2
## and tests on slice t+3, so the model never sees the future. With
## val=FALSE the test slice is scored instead of the validation slice.
## Prints the average error / accuracy / precision / recall / F-score over
## all usable folds. (`get.performance` is kept for interface
## compatibility; it is currently unused.)
timeseries.cv <- function(dataset, cl.name, t.fold=16, get.performance=T,prob.cutoff=0.5, val=TRUE) {
  n = t.fold-3                 # usable folds: the last 3 slices are look-ahead
  n.obs <- nrow(dataset)       # no. of observations
  ss = n.obs/t.fold            # slice size (may be fractional; indices truncate)
  ## Preallocate per-fold metric vectors. (The original used dim(t.fold),
  ## which is just NULL, and relied on implicit vector growth.)
  errors = numeric(n)
  precisions = numeric(n)
  recalls = numeric(n)
  fscores = numeric(n)
  probs = NULL
  actuals = NULL
  print(cl.name)
  for (t in 1:n) {
    ltrain = (t-1)*ss+1
    utrain = ((t+1)*ss)-1
    lval  = (t+1)*ss
    uval  = ((t+2)*ss)-1
    ltest = (t+2)*ss
    utest = (t+3)*ss-1
    train.set = dataset[ltrain:utrain,]
    val.set = dataset[lval:uval,]
    test.set = dataset[ltest:utest,]
    if (val==FALSE) {
      val.set = test.set       # score the held-out test slice directly
    }
    prob = do.classification(train.set, val.set, cl.name)
    predicted = as.numeric(prob > prob.cutoff)
    actual = val.set$y
    ## Force both margins to levels {0,1} so the confusion matrix is
    ## always 2x2, even when a fold predicts or contains only one class.
    predicted = factor(predicted,levels=c(0,1))
    confusion.matrix = table(factor(actual, levels=c(0,1)), predicted)
    ## Bug fix: the error rate is over the scored set (val.set). The
    ## original divided by nrow(test.set); both slices have the same size
    ## by construction, but the denominator was wrong in intent.
    error = (confusion.matrix[1,2]+confusion.matrix[2,1]) / nrow(val.set)
    errors[t] = error
    ## small epsilons keep the denominators from being exactly 0
    precision = (confusion.matrix[2,2]/(confusion.matrix[2,2]+confusion.matrix[1,2]+0.0001))
    precisions[t] = precision
    recall =(confusion.matrix[2,2]/(confusion.matrix[2,2]+confusion.matrix[2,1]+0.0001))
    recalls[t] = recall
    fscore = 2*precision*recall/(precision+recall+0.000001)
    fscores[t] = fscore
    probs = c(probs,prob)
    actuals = c(actuals,actual)
  }
  avg.error = mean(errors)
  cat('avg error=',avg.error,'\n')
  avg.accuracy = 1 - avg.error
  cat('avg Accuracy=',avg.accuracy,'\n')
  avg.precision = mean(precisions)
  cat('avg Precision=',avg.precision,'\n')
  avg.recall = mean(recalls)
  cat('avg recall=',avg.recall,'\n')
  avg.fscore = mean(fscores)
  cat('avg fscore=',avg.fscore,'\n')
}
#
# # (1)-1 evaluate the 10-fold cv results in a performance table
# colnames(airs)[which(names(airs) == "visitors")] <- "y"
# head(airs)
#
# results1 <- cbind(my.classifier(airc, cl.name='lr',do.cv=F),
# my.classifier(airs, cl.name='knn',do.cv=F), # use dataset_num for kNN
# my.classifier(airc, cl.name='nb',do.cv=F),
# my.classifier(airc, cl.name='dtree',do.cv=F),
# my.classifier(airc, cl.name='svm',do.cv=F),
# my.classifier(airc, cl.name='ada',do.cv=F)
# )
head(aira)
## visit_date air_store_id bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1 1 321 0 0
## 2 1 302 0 0
## 3 1 734 0 0
## 4 1 478 0 1
## 5 1 493 0 0
## 6 1 806 0 0
## air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1 0 0
## 2 0 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameItalian.French air_genre_nameIzakaya
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 1
## air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1 0 0
## 2 1 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameOther air_genre_nameWestern.food
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1 0 33.58922 130.3928 0
## 2 0 43.05682 141.3540 0
## 3 0 35.69384 139.7035 0
## 4 0 35.70807 139.7522 0
## 5 0 33.57569 130.3700 0
## 6 0 35.69384 139.7035 0
## day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1 0 0 1 1
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 1
## 5 0 0 1 1
## 6 0 0 1 1
timeseries.cv(aira, cl.name='dtree2', t.fold=16, get.performance=T, val=TRUE)
## [1] "dtree2"
## avg error= 0.5035221
## avg Accuracy= 0.4964779
## avg Precision= 0.4248866
## avg recall= 0.4803596
## avg fscore= 0.3406344
So, our results are very poor when we used all the variables — as bad as a random guess.
airarid=aira[,-2]
head(airarid)
## visit_date bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 1
## 5 1 0 0
## 6 1 0 0
## air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1 0 0
## 2 0 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameItalian.French air_genre_nameIzakaya
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 1
## air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1 0 0
## 2 1 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameOther air_genre_nameWestern.food
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1 0 33.58922 130.3928 0
## 2 0 43.05682 141.3540 0
## 3 0 35.69384 139.7035 0
## 4 0 35.70807 139.7522 0
## 5 0 33.57569 130.3700 0
## 6 0 35.69384 139.7035 0
## day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1 0 0 1 1
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 1
## 5 0 0 1 1
## 6 0 0 1 1
timeseries.cv(airarid, cl.name='dtree2', t.fold=16, get.performance=T)
## [1] "dtree2"
## avg error= 0.5035221
## avg Accuracy= 0.4964779
## avg Precision= 0.4248866
## avg recall= 0.4803596
## avg fscore= 0.3406344
Results are bad even if we remove the store IDs — as bad as a random guess.
head(aira)
## visit_date air_store_id bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1 1 321 0 0
## 2 1 302 0 0
## 3 1 734 0 0
## 4 1 478 0 1
## 5 1 493 0 0
## 6 1 806 0 0
## air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1 0 0
## 2 0 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameItalian.French air_genre_nameIzakaya
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 1
## air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1 0 0
## 2 1 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameOther air_genre_nameWestern.food
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1 0 33.58922 130.3928 0
## 2 0 43.05682 141.3540 0
## 3 0 35.69384 139.7035 0
## 4 0 35.70807 139.7522 0
## 5 0 33.57569 130.3700 0
## 6 0 35.69384 139.7035 0
## day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1 0 0 1 1
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 1
## 5 0 0 1 1
## 6 0 0 1 1
timeseries.cv(aira[-1], cl.name='dtree2', t.fold=16, get.performance=T)
## [1] "dtree2"
## avg error= 0.5035221
## avg Accuracy= 0.4964779
## avg Precision= 0.4248866
## avg recall= 0.4803596
## avg fscore= 0.3406344
Results are still bad if we remove the date information — as bad as a random guess.
head(aira)
## visit_date air_store_id bookingsTillLastWeek air_genre_nameCafe.Sweets
## 1 1 321 0 0
## 2 1 302 0 0
## 3 1 734 0 0
## 4 1 478 0 1
## 5 1 493 0 0
## 6 1 806 0 0
## air_genre_nameCreative.cuisine air_genre_nameDining.bar
## 1 0 0
## 2 0 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameItalian.French air_genre_nameIzakaya
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 1
## air_genre_nameJapanese.food air_genre_nameOkonomiyaki.Monja.Teppanyaki
## 1 0 0
## 2 1 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameOther air_genre_nameWestern.food
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## air_genre_nameYakiniku.Korean.food latitude longitude day_of_weekMonday
## 1 0 33.58922 130.3928 0
## 2 0 43.05682 141.3540 0
## 3 0 35.69384 139.7035 0
## 4 0 35.70807 139.7522 0
## 5 0 33.57569 130.3700 0
## 6 0 35.69384 139.7035 0
## day_of_weekSaturday day_of_weekSunday day_of_weekThursday
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## day_of_weekTuesday day_of_weekWednesday holiday_flg y
## 1 0 0 1 1
## 2 0 0 1 0
## 3 0 0 1 0
## 4 0 0 1 1
## 5 0 0 1 1
## 6 0 0 1 1
typeof(aira)
## [1] "list"
data = subset(aira[, c("visit_date", "y")])
timeseries.cv(data, cl.name='dtree2', t.fold=16, get.performance=T)
## [1] "dtree2"
## avg error= 0.5048192
## avg Accuracy= 0.4951808
## avg Precision= 0.3097959
## avg recall= 0.6153846
## avg fscore= 0.4118524
It’s still pretty bad — as bad as a random guess.
## Run the full classifier suite (both tree variants, logistic regression,
## naive Bayes) through rolling-origin CV on `dataset`, printing each
## model's average metrics in turn.
all.classifiers <- function(dataset) {
  for (clf in c('dtree2', 'dtree', 'lr', 'nb')) {
    timeseries.cv(dataset, cl.name=clf, t.fold=16, get.performance=T)
  }
}
# results1 <- cbind(timeseries.cv(aira, cl.name='dtree2', t.fold=16, get.performance=T) ,
# my.classifier(airs, cl.name='knn',t.fold=16, do.cv=T),
# timeseries.cv(aira, cl.name='dtree', t.fold=16, get.performance=T) ,
# timeseries.cv(aira, cl.name='lr', t.fold=16, get.performance=T) ,
# timeseries.cv(aira, cl.name='nb', t.fold=16, get.performance=T) )
airss=airs
colnames(airss)[which(names(airss) == "visitors")] <- "y"
head(airss)
## visit_date air_store_id bookingsTillLastWeek air_genre_name
## 1 -2.318467 -0.3713360 -0.08115527 0.5272422
## 2 -2.318467 -0.4504708 -0.08115527 0.9240411
## 3 -2.318467 1.3488041 -0.08115527 -0.6631546
## 4 -2.318467 0.2825671 -0.08115527 -1.0599535
## 5 -2.318467 0.3450419 -0.08115527 0.5272422
## 6 -2.318467 1.6486832 -0.08115527 0.5272422
## air_area_name latitude longitude day_of_week holiday_flg y
## 1 -1.61600084 -0.98927786 -1.897016 -1.468681 4.299142 1
## 2 -0.90040186 3.63047062 1.088242 -1.468681 4.299142 0
## 3 1.21528207 0.03768064 0.638755 -1.468681 4.299142 0
## 4 0.03298811 0.04462304 0.651996 -1.468681 4.299142 1
## 5 -1.39820985 -0.99587958 -1.903236 -1.468681 4.299142 1
## 6 1.21528207 0.03768064 0.638755 -1.468681 4.299142 1
#all.classifiers(aira)
#
# timeseries.cv(aira, cl.name='dtreeprune', t.fold=32, get.performance=T)
# timeseries.cv(airss, cl.name='dtreeprune', t.fold=16, get.performance=T)
# model <- rpart(y~., data=aira[1:10000,],control =rpart.control(minsplit=10,minbucket=3, cp=0,maxdepth=10))
# prob = predict(model, newdata=aira[10000:15000,])
#
# model$cptable
#
# pfit<- prune(model, cp=0.01)
# plot(model)
# x=which.min(model$cptable[,"xerror"])
# print(x)
# x=which.min(model$cptable[,"xerror"],"CP")
#head(airs)
#do.classification(airc[1:10000,], aira[10000:15000,], 'nb')
Maybe because we are using a one-model-fits-all strategy across all the restaurants, the results are coming out really bad. As we saw earlier that the individual restaurant is very significant to the results, let us try a separate model per restaurant.
Taking only one restaurant's information, with its date reduced to the day of the year (1–365).
ndf <- a_visited[a_visited$air_store_id == "air_36bcf77d3382d36e",]
head(ndf)
## air_store_id visit_date visitors
## 204402 air_36bcf77d3382d36e 2016-01-01 34
## 204403 air_36bcf77d3382d36e 2016-01-02 43
## 204404 air_36bcf77d3382d36e 2016-01-03 30
## 204405 air_36bcf77d3382d36e 2016-01-05 38
## 204406 air_36bcf77d3382d36e 2016-01-06 22
## 204407 air_36bcf77d3382d36e 2016-01-07 18
summary(ndf)
## air_store_id visit_date visitors
## air_36bcf77d3382d36e:476 Min. :2016-01-01 Min. : 6.00
## air_00a91d42b08b08d9: 0 1st Qu.:2016-04-29 1st Qu.:18.00
## air_0164b9927d20bcc3: 0 Median :2016-08-26 Median :24.00
## air_0241aa3964b7f861: 0 Mean :2016-08-26 Mean :31.78
## air_0328696196e46f18: 0 3rd Qu.:2016-12-23 3rd Qu.:45.50
## air_034a3d5b40d5b1b1: 0 Max. :2017-04-22 Max. :99.00
## (Other) : 0
ndf=ndf[,-1]
head(ndf)
## visit_date visitors
## 204402 2016-01-01 34
## 204403 2016-01-02 43
## 204404 2016-01-03 30
## 204405 2016-01-05 38
## 204406 2016-01-06 22
## 204407 2016-01-07 18
ndf$visit_date=as.numeric(ndf$visit_date)%%365
head(ndf)
## visit_date visitors
## 204402 11 34
## 204403 12 43
## 204404 13 30
## 204405 15 38
## 204406 16 22
## 204407 17 18
table(ndf$y)
## < table of extent 0 >
ndf1=ndf
colnames(ndf1)[which(names(ndf1) == "visitors")] <- "y"
tail(ndf1)
## visit_date y
## 204872 118 24
## 204873 119 13
## 204874 120 30
## 204875 121 19
## 204876 122 21
## 204877 123 50
# Binarize the target: fewer than 17 visitors -> class 0 (slow day),
# 17 or more -> class 1 (busy day). 17 is a hard-coded cutoff for this
# restaurant — the summary below shows ~83% ones, so the classes are
# imbalanced at this threshold; confirm it matches the intended split.
ndf1$y[ndf1$y<17]=0
ndf1$y[ndf1$y>=17]=1
summary(ndf1$y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.8277 1.0000 1.0000
ndf1$y=as.numeric(ndf1$y)
head(ndf1)
## visit_date y
## 204402 11 1
## 204403 12 1
## 204404 13 1
## 204405 15 1
## 204406 16 1
## 204407 17 1
all.classifiers(ndf1)
## [1] "dtree2"
## avg error= 0.2758621
## avg Accuracy= 0.7241379
## avg Precision= 0.6976103
## avg recall= 0.8461503
## avg fscore= 0.763748
## [1] "dtree"
## avg error= 0.1750663
## avg Accuracy= 0.8249337
## avg Precision= 0.8249308
## avg recall= 0.9999958
## avg fscore= 0.9030198
## [1] "lr"
## avg error= 0.2148541
## avg Accuracy= 0.7851459
## avg Precision= 0.820778
## avg recall= 0.9439481
## avg fscore= 0.8698828
## [1] "nb"
## avg error= 0.2519894
## avg Accuracy= 0.7480106
## avg Precision= 0.7645651
## avg recall= 0.8797343
## avg fscore= 0.8104717
#timeseries.cv(ndf1, cl.name='dtree2', t.fold=16, get.performance=T)
The results are better. The decision tree gives the minimum error and the best F-score, so we choose it as our model and now apply it to the test set.
timeseries.cv(ndf1, cl.name='dtree', t.fold=16, get.performance=T, val=FALSE)
## [1] "dtree"
## avg error= 0.1803714
## avg Accuracy= 0.8196286
## avg Precision= 0.8196258
## avg recall= 0.9999958
## avg fscore= 0.8999587
We report this as our classification model's results.
Now we also try autoregressive models to predict one week in advance. For this we choose one particular restaurant that has complete time-series visitation information.
ggplot(ndf, aes(visit_date,visitors)) +geom_line(col = "blue") +labs(y = "All visitors", x = "Date")
Some preprocessing and cleaning (removing outliers).
# Build a univariate series of daily visitor counts and derive smoothed
# versions for plotting and deseasonalizing.
count_ts = ts(ndf[, c('visitors')])
# tsclean() (forecast pkg) interpolates missing values and shrinks outliers
ndf$clean_cnt = tsclean(count_ts)
ggplot() +geom_line(data = ndf, aes(x = visit_date, y = clean_cnt)) + ylab('!!!Cleaned!!!')
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
# 7-day (weekly) centered moving average
ndf$cnt_ma = ma(ndf$clean_cnt, order=7) # clean the dataset
# NOTE(review): despite the "30" in the name, this is a 3-day MA (order=3)
ndf$cnt_ma30 = ma(ndf$clean_cnt, order=3)
ggplot() +
geom_line(data = ndf, aes(x = visit_date, y = clean_cnt, colour = "Counts")) +
geom_line(data = ndf, aes(x = visit_date, y = cnt_ma, colour = "Weekly MA")) +
geom_line(data = ndf, aes(x = visit_date, y = cnt_ma30, colour = "Three Days MA")) +
ylab('Count')
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
head(ndf)
## visit_date visitors clean_cnt cnt_ma cnt_ma30
## 204402 11 34 34 NA NA
## 204403 12 43 43 NA 35.66667
## 204404 13 30 30 NA 37.00000
## 204405 15 38 38 29.00000 30.00000
## 204406 16 22 22 30.85714 26.00000
## 204407 17 18 18 31.42857 19.33333
#deseasonalized series
count_ma = ts(na.omit(ndf$cnt_ma30), frequency=7)
decomp = stl(count_ma, s.window="periodic")
deseasonal_cnt <- seasadj(decomp)
plot(decomp)
adf.test(count_ma, alternative = "stationary")
## Warning in adf.test(count_ma, alternative = "stationary"): p-value smaller
## than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: count_ma
## Dickey-Fuller = -8.6512, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Acf(count_ma, main='')
Pacf(count_ma, main='')
count_d1 = diff(deseasonal_cnt, differences = 1)
plot(count_d1)
adf.test(count_d1, alternative = "stationary")
## Warning in adf.test(count_d1, alternative = "stationary"): p-value smaller
## than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: count_d1
## Dickey-Fuller = -10.736, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Acf(count_d1, main='ACF for Differenced Series')
Pacf(count_d1, main='PACF for Differenced Series')
auto.arima(deseasonal_cnt, seasonal=FALSE)
## Series: deseasonal_cnt
## ARIMA(0,0,2) with non-zero mean
##
## Coefficients:
## ma1 ma2 mean
## 1.0262 0.9715 31.7685
## s.e. 0.0156 0.0108 0.5632
##
## sigma^2 estimated as 16.91: log likelihood=-1344.29
## AIC=2696.59 AICc=2696.67 BIC=2713.23
fit<-auto.arima(deseasonal_cnt, seasonal=FALSE)
# Bug fix: this residual plot was titled "(1,1,1) Model Residuals", but
# auto.arima() selected ARIMA(0,0,2) (see the output above); use a
# neutral, accurate title instead.
tsdisplay(residuals(fit), lag.max=45, main='auto.arima Model Residuals')
# Manually specified ARIMA(1,1,7), with orders read off the ACF/PACF plots
fit2 = arima(deseasonal_cnt, order=c(1,1,7))
fit2
##
## Call:
## arima(x = deseasonal_cnt, order = c(1, 1, 7))
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4 ma5 ma6 ma7
## -0.1923 0.3782 -0.054 -1.0965 -0.3096 0.0572 0.1003 -0.0756
## s.e. 0.2775 0.2751 0.077 0.0568 0.3102 0.0772 0.0555 0.0547
##
## sigma^2 estimated as 15.78: log likelihood = -1331.31, aic = 2680.63
tsdisplay(residuals(fit2), lag.max=15, main='Seasonal Model Residuals')
fcast <- forecast(fit2, h=7)
plot(fcast)
Hold out the last month and predict on it. We obtained the ARIMA order parameters from the ACF and PACF plots.
hold <- window(ts(deseasonal_cnt), start=440)
# Refit the ARIMA(1,1,7) without the final 31 observations (the holdout)
fit_no_holdout = arima(ts(deseasonal_cnt[-c(440:470)]), order=c(1,1,7))
fcast_no_holdout <- forecast(fit_no_holdout,h=31)
y_=fcast_no_holdout$mean
test=ndf[440:470,]
# Bug fix: ndf has no column named `y` (the rename to "y" was done only on
# the copy ndf1), so test$y was NULL and the downstream thresholding and
# comparison were silent no-ops. Use the actual visitor counts.
y=test$visitors
y
## NULL
tail(ndf)
## visit_date visitors clean_cnt cnt_ma cnt_ma30
## 204872 118 24 24 36.28571 33.00000
## 204873 119 13 13 33.57143 22.33333
## 204874 120 30 30 31.28571 20.66667
## 204875 121 19 19 NA 23.33333
## 204876 122 21 21 NA 30.00000
## 204877 123 50 50 NA NA
plot(fcast_no_holdout, main=" ")
lines(ts(deseasonal_cnt))
# Convert the forecast and the actuals to the same 0/1 busy-day classes
# used by the classifiers (cutoff: 17 visitors) so they can be tabulated.
y_[y_<17]=0
y_[y_>=17]=1
# NOTE(review): as written upstream, `y` is test$y, which is NULL (ndf has
# no `y` column — only ndf1 does; see the "## NULL" output above), so
# these two lines are no-ops and the table would be empty. Confirm that
# test$visitors was intended.
y[y<17]=0
y[y>=17]=1
#table(y,y_)
Accuracy, precision, and F-score when the regression result is converted into classification classes.
Regression results:
fit_w_seasonality = auto.arima(deseasonal_cnt, seasonal=TRUE)
fit_w_seasonality
## Series: deseasonal_cnt
## ARIMA(1,0,1)(1,0,2)[7] with non-zero mean
##
## Coefficients:
## ar1 ma1 sar1 sma1 sma2 mean
## 0.6131 0.2628 0.9124 -0.8922 0.1159 31.5342
## s.e. 0.0424 0.0405 0.0428 0.0627 0.0479 1.7050
##
## sigma^2 estimated as 24.61: log likelihood=-1430.22
## AIC=2874.45 AICc=2874.69 BIC=2903.58
We used Prophet to predict future visitation. Prophet is a library developed by Facebook; it uses an additive regression model, automatically detecting changes in the trend and finding seasonal patterns.
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
ndf_p <- read.csv("ndf_1.csv")
ndf_p <- ndf_p[, c('visit_date', 'visitors')]
setnames(ndf_p, old=c("visit_date","visitors"), new=c("ds", "y"))
head(ndf_p)
## ds y
## 1 2016-01-01 34
## 2 2016-01-02 43
## 3 2016-01-03 30
## 4 2016-01-05 38
## 5 2016-01-06 22
## 6 2016-01-07 18
library(prophet)
## Loading required package: Rcpp
m <- prophet(ndf_p)
## Disabling yearly seasonality. Run prophet with yearly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
## Initial log joint probability = -12.744
## Optimization terminated normally:
## Convergence detected: absolute parameter change was below tolerance
future <- make_future_dataframe(m, periods = 31)
forecast <- predict(m, future)
plot(m, forecast)
The yhat column contains the forecast; additional columns give the uncertainty intervals and the seasonal components of the forecast.
tail(forecast[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
## ds yhat yhat_lower yhat_upper
## 502 2017-05-18 22.90804 9.623232 36.38780
## 503 2017-05-19 30.96694 17.196381 45.09868
## 504 2017-05-20 57.42978 42.647437 71.70871
## 505 2017-05-21 51.28849 37.397565 64.73551
## 506 2017-05-22 22.98990 9.327725 38.23588
## 507 2017-05-23 21.47555 6.366852 35.02161
# Score Prophet's forecast on the same holdout window (rows 440:470).
y_=forecast$yhat[440:470]
test=ndf[440:470,]
# Bug fix: ndf carries the column `visitors`, not `y` (the rename was done
# only on the copy ndf1); test$y was NULL, which made the comparison below
# a silent no-op. Use the actual visitor counts.
y=test$visitors
# Binarize both series with the 17-visitor busy-day cutoff
y_[y_<17]=0
y_[y_>=17]=1
y[y<17]=0
y[y>=17]=1
#table(y,y_)
This components plot allows us to see the forecast broken into its overall trend and weekly seasonality. As our data-analysis part showed, more visitors go to the restaurants during the weekend (plus on Fridays).
prophet_plot_components(m, forecast)
Many features seemed to be very relevant for classification but turned out not to be that useful, while the date information alone is a very good predictor of customer traffic.
We later realized that for the restaurant data we used, the classes are not balanced (i.e., its median is not 17); if we rerun on the restaurant's actual median, we get bad results with classification but still good results with the autoregressive models. The immediate future work is fine-tuning the classification models. Also, time-series cross-validation for the autoregressive models is yet to be implemented, as we did for classification. We worked on weather data extensively, but the final manual reconciliation of names would take a long time before it could be combined. We ran out of time, and hence we submit our code here.